# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px  # data visualisation
# Input data files are available in the read-only "../input/" directory
import os

# List every file available under the Kaggle input directory.
# (Loop bodies were unindented in the flattened source; restored here.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Exploratory data analysis of Indian trending videos.
# Load the dataset of Indian trending videos.
df = pd.read_csv("/kaggle/input/youtube-new/INvideos.csv")
df
df.tail()
# Number of records present in the csv file.
df.shape[0]
# Number of unique videos present in the trending list.
len(df['video_id'].unique())
It can be seen that the number of records does not match the number of unique videos in the trending list. This is natural, as a video can stay on the trending list for any number of days or months.
df.describe()
# Null values
df.isnull().any()
# Abstract the numeric/flag columns used for modelling.
# FIX: take an explicit .copy() — the original assigned into a slice of df,
# which triggers SettingWithCopyWarning and may silently fail to write.
new_df = df[["category_id", "views", "likes", "dislikes", "comment_count",
             "comments_disabled", "ratings_disabled", "video_error_or_removed"]].copy()
new_df  # abstracted data
# Encoding the boolean flag columns to 0/1 integers.
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
# Label Encoder can only take a one-dimensional column.
new_df.iloc[:, 5] = lb.fit_transform(new_df.iloc[:, 5])  # comments_disabled
new_df.iloc[:, 6] = lb.fit_transform(new_df.iloc[:, 6])  # ratings_disabled
new_df.iloc[:, 7] = lb.fit_transform(new_df.iloc[:, 7])  # video_error_or_removed
new_df
new_df.describe()
# See the correlation between the numeric columns.
import seaborn as sns
# FIX: restrict to numeric columns explicitly — this matches the old
# pandas numeric-only default and keeps .corr() working on modern pandas,
# where string columns make it raise.
df.select_dtypes(include=[np.number]).corr()
df = df[df["video_id"] != "#NAME?"]  # removing data that has error video_id
df
# Higher the value, the stronger the correlation.
sns.heatmap(df.select_dtypes(include=[np.number]).corr(), annot=True)
Correlation Analysis
According to Youtube Help for what makes a video trending:
Trending aims to balance all of these considerations. To achieve this, Trending considers many signals, including (but not limited to):
With the current dataset we cannot find where the views are coming from or how the other videos from the same channel are performing because it is not specified. We can only focus on the other three considerations and see the correlation.
Extracting the types of categories
import json
# Build a category_id -> category name mapping for Indian trending videos.
with open('/kaggle/input/youtube-new/IN_category_id.json') as f:  # Indian trending videos
    data = json.load(f)
    print(data)
categories_dict = {}
print("Different types of categories: " + str(len(data["items"])))
# Iterate the items directly instead of indexing by position.
for item in data["items"]:
    categories_dict[int(item["id"])] = item["snippet"]["title"]
print(categories_dict)
Mapping the category id to the category names
# Map the category id to the human-readable category name.
df['category'] = df['category_id'].map(categories_dict)
df.head(5)
# Count rows per category (each row is one day of trending for a video).
arr = df["category"].unique()
sum_of_videos = [df.loc[df["category"] == cat].shape[0] for cat in arr]
fig = px.pie(df, values=sum_of_videos, names=arr, title="Categories of Indian trending videos")
fig.show()
Comparing it with different countries
import json
# Load the US and German category taxonomies for cross-country comparison.
with open('/kaggle/input/youtube-new/US_category_id.json') as f:  # US trending videos
    data_us = json.load(f)
with open('/kaggle/input/youtube-new/DE_category_id.json') as f:  # German trending videos (comment fixed: was "Indian")
    data_ger = json.load(f)

categories_dict_us = {}
# BUG FIX: the original printed len(data["items"]) — the *Indian* taxonomy —
# for both the US and German counts; use the matching datasets instead.
print("Different types of categories in US: " + str(len(data_us["items"])))
for item in data_us["items"]:
    categories_dict_us[int(item["id"])] = item["snippet"]["title"]
print(categories_dict_us)

categories_dict_ger = {}
print("Different types of categories in Ger: " + str(len(data_ger["items"])))
for item in data_ger["items"]:
    categories_dict_ger[int(item["id"])] = item["snippet"]["title"]
print(categories_dict_ger)
# Load the US and German trending-video data and attach category names.
df_us = pd.read_csv("/kaggle/input/youtube-new/USvideos.csv")   # US videos data
df_ger = pd.read_csv("/kaggle/input/youtube-new/DEvideos.csv")  # German videos data
df_us['category'] = df_us['category_id'].map(categories_dict_us)
df_ger['category'] = df_ger['category_id'].map(categories_dict_ger)

# Category share of US trending videos.
arr = df_us["category"].unique()
sum_of_videos = [df_us.loc[df_us["category"] == cat].shape[0] for cat in arr]
fig = px.pie(df_us, values=sum_of_videos, names=arr, title="Categories of US trending videos")
fig.show()

# Category share of German trending videos.
arr = df_ger["category"].unique()
sum_of_videos = [df_ger.loc[df_ger["category"] == cat].shape[0] for cat in arr]
fig = px.pie(df_ger, values=sum_of_videos, names=arr, title="Categories of Germany trending videos")
fig.show()
Results
Most of the Indian trending videos belong to the "Entertainment" category. Both the German and US trending videos show similar statistics, with the highest number of videos coming from the "Entertainment" category. However, the share of the "Entertainment" category in India is larger than in the other two countries.
Trending videos repeating in the list
# Count how many rows each video_id has, largest first.
# NOTE(review): .iloc[1:] drops the top-ranked row — confirm this is intentional.
repeat_counts = df.groupby('video_id').size().sort_values(ascending=False).reset_index(name="count")
repeat_counts.iloc[1:].head(10)
# Indicates that there are videos which come on the trending list more than once.
df.loc[df["video_id"] == "rRr1qiJRsXk"]
How many times does a video trend
# For each video, count how many days it appeared on the trending list.
videos_appeared_most = df.groupby('video_id').size().sort_values(ascending=False)
indexes = videos_appeared_most.index.values
# Keep the earliest trending row per video, then attach its total day count.
tdf = df[df['video_id'].isin(indexes)].sort_values(
    by='trending_date', ascending=True).drop_duplicates(subset=['video_id'])
tdf['trending_days'] = tdf['video_id'].map(videos_appeared_most)
tdf.sort_values(by='trending_days', ascending=False, inplace=True)
tdf = tdf[['video_id', 'title', 'trending_days', 'views', 'likes',
           'dislikes', 'comment_count', 'category', 'channel_title']]
tdf.head(11)
# Also attach trending_days to the full frame for later feature use.
df['trending_days'] = df['video_id'].map(videos_appeared_most)
df
# Category share among repeat-trending videos.
arr = tdf["category"].unique()
sum_of_videos = [tdf.loc[tdf["category"] == cat].shape[0] for cat in arr]
fig = px.pie(tdf, values=sum_of_videos, names=arr,
             title="Categories of trending videos which appear more number of times on the trending list")
fig.show()
# FIX: numeric-only correlation so string columns don't break .corr() on modern pandas.
sns.heatmap(tdf.select_dtypes(include=[np.number]).corr(), annot=True)
No. of times the video comes on trending list has some correlation. It has about 0.5 correlation which is pretty good. Hence it will be included in our model.
Videos that have more dislikes than likes
# Flag videos with more dislikes than likes.
# Idiom fix: np.where(cond, True, False) is redundant — the comparison
# already yields the same boolean column.
df['temp1'] = df['dislikes'] > df['likes']
df.loc[df["temp1"] == True]  # total of 645 rows with more dislikes than likes
Channel with highest number of videos on trending list
# Top 20 channels by number of trending rows (one row per day a video trends).
total_channels = (
    df.groupby("channel_title")
      .size()
      .reset_index(name="video_count")
      .sort_values("video_count", ascending=False)
      .head(20)
)
total_channels  # top 20 channels and their number of videos
fig = px.bar(total_channels, x="video_count", y="channel_title", color='video_count',
             title="Top 20 channels",
             labels={"video_count": "Number of videos on the trending list",
                     "channel_title": "Channel"})
fig.show()
# Category mix of the channel with the most trending rows.
temp1 = df.loc[df["channel_title"] == "VikatanTV"].groupby("category").size().reset_index(name="cat_count")
temp1.plot.pie(y="cat_count", subplots=True, figsize=(11, 6), labels=temp1["category"])
Comparing with other countries
# Same top-channel analysis for the US and German datasets.
total_channels_us = (
    df_us.groupby("channel_title").size().reset_index(name="video_count")
         .sort_values("video_count", ascending=False).head(20)
)
fig = px.bar(total_channels_us, x="video_count", y="channel_title", color='video_count',
             title="Top 20 channels",
             labels={"video_count": "Number of videos of US on the trending list",
                     "channel_title": "Channel"})
fig.show()
total_channels_ger = (
    df_ger.groupby("channel_title").size().reset_index(name="video_count")
          .sort_values("video_count", ascending=False).head(20)
)
fig = px.bar(total_channels_ger, x="video_count", y="channel_title", color='video_count',
             title="Top 20 channels",
             labels={"video_count": "Number of videos of Germany on the trending list",
                     "channel_title": "Channel"})
fig.show()
# Category mix of the top channel in each country.
temp1 = df_us.loc[df_us["channel_title"] == "ESPN"].groupby("category").size().reset_index(name="cat_count")
temp1.plot.pie(y="cat_count", subplots=True, figsize=(11, 6), labels=temp1["category"])  # top US channel
temp2 = df_ger.loc[df_ger["channel_title"] == "Galileo"].groupby("category").size().reset_index(name="cat_count")
temp2.plot.pie(y="cat_count", subplots=True, figsize=(11, 6), labels=temp2["category"])  # top German channel
Results
To our surprise, the channel with the most videos on the Indian trending list mostly publishes videos in the "Shows" category. Similarly, "ESPN", the channel with the most videos on the US trending list, publishes videos in the "Sports" category despite that category's low share of the overall categories. Even the top German channel has a few videos from the "Education" category, even though that category has a small share.
This can indicate that the reason these videos reach the trending list is their "specificity", "rarity", or "relatable nature of the content to only a specific subgroup".
Although we got some surprising results on this one occasion, categories do not have a strong correlation with the other data. Hence, they may be removed from the analysis.
Converting format of the trending dates
# Parse trending_date (format yy.dd.mm) and publish_time into datetimes,
# then break them into day/month/year plus weekday name and hour.
df["trending_date"] = pd.to_datetime(df["trending_date"], format="%y.%d.%m")
df["publish_time"] = pd.to_datetime(df["publish_time"])
df = df.assign(
    trending_day=df.trending_date.dt.day,
    trending_month=df.trending_date.dt.month,
    trending_year=df.trending_date.dt.year,
)
df = df.assign(
    publish_day=df.publish_time.dt.day,
    publish_month=df.publish_time.dt.month,
    publish_year=df.publish_time.dt.year,
)
df["publishing_day"] = df["publish_time"].dt.day_name()
df["publishing_hour"] = df["publish_time"].dt.hour
df
Most popular publishing day of the videos
# How many trending rows were published on each weekday.
day_counts = df.groupby("publishing_day").size().reset_index(name="count")
fig = px.bar(day_counts, x="publishing_day", y="count",
             title="Most popular day to publish a trending video")
fig.show()
Friday can be seen as the most popular day to publish trending videos maybe because of the start of the weekend.
Most popular publishing hour
# How many trending rows were published in each hour of the day.
hour_counts = df.groupby("publishing_hour").size().reset_index(name="count")
fig = px.bar(hour_counts, x="publishing_hour", y="count",
             title="Most popular time to publish a trending video")
fig.show()
2pm seems to be the time when most videos are published. Both the publishing day and the publishing hour appear to have little significance: all other days and hours show a similar, roughly normal distribution.
How long till the video becomes trending?
# Mark rows where the video trended in a different year than it was published.
# Idiom fix: the comparison already yields the boolean column that
# np.where(cond, True, False) produced.
df['temp'] = df['trending_year'] != df['publish_year']
df.loc[df["temp"] == True]
df["publish_year"].loc[df["temp"] == True].unique()
df["publish_month"].loc[df["temp"] == True].unique()  # months where these earlier-year videos were published
# Videos that were published in "December" of 2017 and trended in 2018.
temp1 = df.loc[df["temp"] == True]
temp1 = temp1.loc[temp1["publish_month"] == 12]
temp1["trending_month"].unique()
# Viral videos that re-emerge suddenly.
temp2 = df.loc[df["temp"] == True]
temp2.loc[temp2["publish_month"] == 5]
This shows that the trending year and the year when the video was published can differ. So some videos that were published a long time ago can resurface because of their virality; most viral videos nowadays work this way. As the above table shows, 400 videos were published in 2017, but only 2 of them were published in May 2017 and trended in January 2018. The other videos trended just a month or so after being published in December, which is pretty normal.
# Date-only version of publish_time so the subtraction yields whole days.
df["publish_date"] = pd.to_datetime(df["publish_time"].dt.date)
df
# Days between publishing and appearing on the trending list.
df["days_before_trend"] = (df["trending_date"] - df["publish_date"]).dt.days
df.loc[:, "trending_date":"days_before_trend"]
fig = px.box(df, y="days_before_trend")
fig.show()
# Correlation of the new feature against the other numeric features.
temp = df[['category_id', 'views', 'likes', 'dislikes', 'comment_count',
           'trending_days', 'days_before_trend']]
sns.heatmap(temp.corr(), annot=True)
Results: Even though it revealed some outliers and showed how old videos can emerge as viral videos, this feature holds no significant correlation with the other data. So it can be removed when we use the data for prediction purposes.
Number of Views
df['views'].min()  # minimum view count
df['views'].max()  # maximum view count
df.loc[df["views"] == 125432237]  # video with the most views
# Views distribution as a strip plot.
fig = px.strip(data_frame=df, x=df["views"])
fig.show()
Likes
df['likes'].min()  # minimum likes count
df.loc[df["likes"] == 0]  # videos with zero likes
df['likes'].max()  # maximum likes count
df.loc[df["likes"] == 2912710]  # video with the max number of likes
# Likes distribution as a strip plot.
fig = px.strip(data_frame=df, x=df["likes"])
fig.show()
Dislikes
df['dislikes'].min()  # minimum dislikes count
# Videos with zero dislikes.
df.loc[df["dislikes"] == 0]
It is safe to say that just because a video has zero dislikes, it does not mean the same video has zero likes. The zero-dislike set has about 9 more rows than the zero-like set.
df['dislikes'].max()  # maximum dislikes count
df.loc[df["dislikes"] == 1545017]  # video with the max number of dislikes
# Dislikes distribution as a strip plot.
fig = px.strip(data_frame=df, x=df["dislikes"])
fig.show()
Comment_count
df['comment_count'].min()  # minimum comments count
df.loc[df["comment_count"] == 0]  # videos with zero comments
df['comment_count'].max()  # maximum comments count
df.loc[df["comment_count"] == 827755]  # video with the max number of comments
# Comment-count distribution as a strip plot.
fig = px.strip(data_frame=df, x=df["comment_count"])
fig.show()
df.loc[df["comment_count"] > df["likes"]]     # videos with more comments than likes
df.loc[df["comment_count"] > df["dislikes"]]  # videos with more comments than dislikes
Results:
Theory behind the unsupervised learning algorithms
Unsupervised learning algorithms are used when there is no target variable. Here we want to classify our youtube videos based on the closeness of the data. And the following algorithms use distance as a measure to calculate the clusters.
Kmeans Clustering The KMeans algorithm clusters data by trying to separate samples in n groups of equal variance, minimizing a criterion known as the inertia or within-cluster sum-of-squares. Hence the no. of clusters is calculated using the elbow method. The below WCSS graph is calculated for each value of k, i.e from 1 to 11 and optimised k is taken where there is an elbow point. In our case, the optimal number of clusters are 5 and so n=5.
Hierarchical Agglomerative Clustering Hierarchical clustering is a general family of clustering algorithms that build nested clusters by merging or splitting them successively. This hierarchy of clusters is represented as a tree (or dendrogram). The root of the tree is the unique cluster that gathers all the samples, the leaves being the clusters with only one sample. Ward minimizes the sum of squared differences within all clusters. It is a variance-minimizing approach and in this sense is similar to the k-means objective function but tackled with an agglomerative hierarchical approach.
Metrics
The above info is obtained from:
https://towardsdatascience.com/how-to-evaluate-unsupervised-learning-models-3aa85bd98aa2
K means Clustering
from sklearn import metrics
# Feature frame for clustering: engagement counts plus trending_days.
ndf = df[["video_id", "likes", "dislikes", "views", "comment_count", "trending_days"]]
ndf.shape
x = ndf.values
x = x[:, 1:]  # drop the video_id column; keep only the numeric features
x
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
Standardisation = StandardScaler()
# Scaled features (zero mean, unit variance) so distance-based clustering is fair.
nx = Standardisation.fit_transform(x)
nx
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt

# Elbow method: fit KMeans for k = 1..10 and record the within-cluster
# sum of squares (exposed by sklearn as inertia_); the "elbow" point
# suggests the number of clusters to use.
# (Loop body was unindented in the flattened source; restored here.)
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=0)
    kmeans.fit(nx)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title("The elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()
# Fit the final model with the elbow-selected k=5 and score the clustering.
pipe = KMeans(n_clusters=5, init='k-means++', random_state=0)
ymeans = pipe.fit_predict(nx)
metrics.silhouette_score(nx, ymeans)
metrics.calinski_harabasz_score(nx, ymeans)
ymeans
ndf['Cluster1'] = ymeans
ndf
# NOTE(review): arr1 hard-codes a display order assumed to match the sorted
# cluster ids produced by this run — confirm against the fitted labels.
arr1 = ["Cluster 1", "Cluster 2", "Cluster 5", "Cluster 3", "Cluster 4"]
arr = list(set(ymeans))
sum_of_videos = [ndf.loc[ndf["Cluster1"] == c].shape[0] for c in arr]
fig = px.pie(ndf, values=sum_of_videos, names=arr1, title="Percent of each cluster")
fig.show()
print(sum_of_videos)
fig = px.scatter(ndf, x="views", y="likes", color="Cluster1",
                 size='comment_count', hover_data=['dislikes', 'trending_days'])
fig.show()
temp = ndf["video_id"].loc[ndf["Cluster1"] == 2]  # all items in Cluster 2 come from one video
temp
# FIX: numeric-only correlation — ndf contains the string video_id column.
sns.heatmap(ndf.select_dtypes(include=[np.number]).corr(), annot=True)
Hierarchical Agglomerative Clustering
from sklearn.cluster import AgglomerativeClustering

# Hierarchical clustering with ward linkage (the default), which
# minimises within-cluster variance.
pipe = AgglomerativeClustering(n_clusters=5, linkage='ward')
ymeans = pipe.fit_predict(nx)
ndf['Cluster2'] = ymeans
ndf
# NOTE(review): label order hard-coded to match observed cluster ids — confirm.
arr1 = ["Cluster 1", "Cluster 3", "Cluster 4", "Cluster 5", "Cluster 2"]
arr = list(set(ymeans))
sum_of_videos = [ndf.loc[ndf["Cluster2"] == c].shape[0] for c in arr]
fig = px.pie(ndf, values=sum_of_videos, names=arr1, title="Percent of each cluster")
fig.show()
print(sum_of_videos)
fig = px.scatter(ndf, x="views", y="likes", color="Cluster2",
                 size='comment_count', hover_data=['dislikes', 'trending_days'])
fig.show()
# FIX: numeric-only correlation — ndf contains the string video_id column.
sns.heatmap(ndf.select_dtypes(include=[np.number]).corr(), annot=True)
metrics.silhouette_score(nx, ymeans)
metrics.calinski_harabasz_score(nx, ymeans)
# Repeat the agglomerative clustering with the remaining linkage criteria
# and compare cluster sizes and scores.
# (Loop bodies were unindented in the flattened source; restored here.)
pipe = AgglomerativeClustering(n_clusters=5, linkage='complete')  # complete linkage
ymeans = pipe.fit_predict(nx)
ndf['Cluster3'] = ymeans
ndf
arr = list(set(ymeans))
sum_of_videos = [ndf.loc[ndf["Cluster3"] == c].shape[0] for c in arr]
print(sum_of_videos)
metrics.silhouette_score(nx, ymeans)
metrics.calinski_harabasz_score(nx, ymeans)

pipe = AgglomerativeClustering(n_clusters=5, linkage='average')  # average linkage
ymeans = pipe.fit_predict(nx)
ndf['Cluster4'] = ymeans
arr = list(set(ymeans))
sum_of_videos = [ndf.loc[ndf["Cluster4"] == c].shape[0] for c in arr]
print(sum_of_videos)
metrics.silhouette_score(nx, ymeans)
metrics.calinski_harabasz_score(nx, ymeans)

pipe = AgglomerativeClustering(n_clusters=5, linkage='single')  # single linkage
ymeans = pipe.fit_predict(nx)
ndf['Cluster5'] = ymeans
arr = list(set(ymeans))
sum_of_videos = [ndf.loc[ndf["Cluster5"] == c].shape[0] for c in arr]
print(sum_of_videos)
metrics.silhouette_score(nx, ymeans)
metrics.calinski_harabasz_score(nx, ymeans)
Conclusion